home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
The 640 MEG Shareware Studio 4
/
The 640 Meg Shareware Studio CD-ROM Volume IV (Data Express)(1994).ISO
/
wp
/
gsar106.zip
/
GSARBMG.C
< prev
next >
Wrap
C/C++ Source or Header
|
1993-06-01
|
18KB
|
509 lines
/* gsarbmg.c *************************************** UPDATED: 930325.18:45 TT
*
* Subroutines for fast string searching; no regular expressions
*
* Adapted from:
*
* Boyer/Moore/Gosper-assisted 'egrep' search, with delta0 table as in
* original paper (CACM, October, 1977). No delta1 or delta2. According to
* experiment (Horspool, Soft. Prac. Exp., 1982), delta2 is of minimal
* practical value. However, to improve for worst case input, integrating
* the improved Galil strategies (Apostolico/Giancarlo, Siam. J. Comput.,
* February 1986) deserves consideration.
*
* James A. Woods
* NASA Ames Research Center
*
* 29 April 1986 Jeff Mogul Stanford
* Adapted as a set of subroutines for use by a program. No
* regular-expression support.
*
* 12 February 1992 Tormod Tjaberg
* Used parts of the original routines to implement extremely fast
* file search & replace mechanisms on ASCII & non ASCII files taking
* care not to 'chop' up the search pattern.
*
* Note:
*
* If a file consists of the following bytes: 'abrabra' a search for
* 'abra' will yield two matches. However if we are to replace 'abra'
* with 'foobar' only one occurrence will be changed and the output
* file will contain 'foobarbra'.
*
* Changes:
* 930131 : Corrected obscure bug in BMG search function. In some cases the
* character after the actual end of the buffer ( very small file )
* could result in a 'LARGE' jump outside the buffer. The solution
* is to make sure that *(strend+1) != *strend.
* 930104 : Corrected bug in BMGSearchReplace function. If the special case
* described above was true 'n' ( number of bytes to write from
* search buffer ) would become negative, resulting in a huge file.
* 920514 : Cleaned up some code, made fwrite more 'consistent'
* made the search buffer preallocated
* 920404 : BMG routines use compile time generated table instead of
* filling it in at run time
*
* Currently compiles under:
* Turbo C 2.0, Turbo C++ 1.0, Turbo C++ 3.0, Zortech C++ 3.0,
* Watcom C 386 8.0, Ultrix ANSI C, Microsoft 6.0, GCC
*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <ctype.h>
#include <sys/types.h>
#include "comp_dep.h"
#include "gsar.h"
/* Jump distance stored in bmg_delta0[] for the last character of the
 * pattern. The search loops recognise that jump by testing whether the
 * scan pointer has reached pStart + LARGE and then subtract LARGE again.
 * The expansion is parenthesised so it always behaves as a single operand,
 * whatever operator it is placed next to.
 */
#define LARGE ( BUFSIZ + PAT_BUFSIZ ) /* overshoot purposes */
/* Variables needed to perform the BMG search. To gain some speed the ASCII
 * table is set up at compile time. Furthermore the buffer is preallocated
 */
int bmg_patlen; /* length of pattern */
unsigned char bmg_pattern[ PAT_BUFSIZ ]; /* actual pattern */
int bmg_delta0[ 256 ]; /* skip distance for each byte value */
/* Search buffer: bytes carried over from the previous block are stored in
 * front of the freshly read block, and the search loops write one sentinel
 * byte just past the end of the data.
 */
unsigned char bmg_buffer[ BUFSIZ+PAT_BUFSIZ+2 ]; /* search buffer */
/* Character map applied to every buffer byte before it is compared with the
 * pattern. It starts out as the identity mapping (entry i holds i), sixteen
 * entries per row below; BMGSetup() rewrites the 'A'..'Z' entries when a
 * case-folded search is requested, so the table must stay writable.
 */
unsigned char bmg_cmap[ 256 ] =
{
     0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,
    16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,
    32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,
    48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
    64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
    80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,
    96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
   112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
   128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
   144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
   160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
   176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
   192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
   208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
   224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
   240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255
};
/* function prototypes internal to this module */
void Verbose( OUTPUT_CTRL *, unsigned long , int ,
              unsigned char *, unsigned char * );

/* PutContextChar()
 *
 * Input   : ch - byte taken from the search buffer
 *           fp - stream to write to
 *
 * Returns : nothing
 *
 * Writes ch unchanged when it is displayable, otherwise writes a '.'.
 * Shared by the hexadecimal and the textual context display.
 */
static void PutContextChar( unsigned char ch, FILE *fp )
{
#ifdef MSDOS /* MSDOS can display all characters except CTRL chars */
   if ( !iscntrl( (int) ch ) )
#else /* its __UNIX__ */
   if ( isprint( (int) ch ) )
#endif
      fputc( ch, fp );
   else
      fputc( '.', fp );
}

/* Verbose()
 *
 * Input   : pCtrl   - pointer to structure containing output and ctrl info
 *           FileOfs - file offset that corresponds to pStart
 *           BufOfs  - match offset in search buffer, relative to pStart
 *           pStart  - pointer to start of the search buffer
 *           pEnd    - pointer to end of the search buffer
 *
 * Returns : nothing
 *
 * Displays buffer information ( filename, offset, context ) on
 * pCtrl->fpMsg according to the flags set in the structure.. i.e. be a
 * bit verbose.
 *
 * The context is a window of pCtrl->Context bytes centred on the middle
 * of the match and clipped to [pStart, pEnd). When fHex is set the hex
 * dump walks the window to its end, so a textual pass requested at the
 * same time finds nothing left to print.
 */
void Verbose( OUTPUT_CTRL *pCtrl, unsigned long FileOfs, int BufOfs,
              unsigned char *pStart, unsigned char *pEnd )
{
   unsigned char *pSC;      /* start of context, advances while printing */
   unsigned char *pEC;      /* end of context ( one past the last byte ) */
   unsigned char *pRow;     /* first byte of the hex row being printed */
   unsigned long CtxOfs;    /* file offset shown in front of a hex row */
   int i;                   /* column counter */

   if ( pCtrl->fFileSpec )                    /* display file name */
      fprintf( pCtrl->fpMsg, "%s : ", pCtrl->pInputFile );

   if ( pCtrl->fByteOffset )                  /* display byte offset */
      fprintf( pCtrl->fpMsg, "0x%lx%s",
               FileOfs + BufOfs,
               ( pCtrl->fTextual ) ? " : " : "" );

   /* Display a textual or a hexadecimal context
    */
   if ( pCtrl->fTextual || pCtrl->fHex )
   {
      /* centre the window on the middle of the match */
      pSC = pStart + BufOfs - pCtrl->Context/2 + bmg_patlen/2 ;
      if ( pSC < pStart )                     /* outside the buffer ? */
         pSC = pStart;

      pEC = pSC + pCtrl->Context;
      if ( pEC > pEnd )                       /* outside the buffer ? */
      {
         pEC = pEnd;
         /* if we have to truncate to pEnd readjust the start
          * of the context if possible.
          */
         if ( pEC - pCtrl->Context > pStart )
            pSC = pEC - pCtrl->Context;
      }

      /* display a hexadecimal context: offset, 16 hex bytes, 16 characters
       */
      if ( pCtrl->fHex )
      {
         fputc( '\n', pCtrl->fpMsg );
         CtxOfs = FileOfs + ( pSC - pStart );
         while ( pSC != pEC )
         {
            pRow = pSC;                       /* remember where the row starts */
            fprintf( pCtrl->fpMsg, "0x%08lx: ", CtxOfs );   /* hex offset */

            for ( i = 0 ; i < 16 ; i++ )      /* hex column */
            {
               if ( pSC != pEC )
                  fprintf( pCtrl->fpMsg, "%02x ", ( unsigned char ) *pSC++ );
               else
                  /* pad a short final row with the width of one "%02x "
                   * cell so the character column stays aligned
                   */
                  fprintf( pCtrl->fpMsg, "   " );
            }

            pSC = pRow;                       /* same bytes again, as characters */
            for ( i = 0 ; i < 16 && pSC != pEC ; i++ )
               PutContextChar( *pSC++, pCtrl->fpMsg );

            CtxOfs += 16;                     /* next row starts 16 bytes on */
            fputc( '\n', pCtrl->fpMsg );
         }
      }

      /* display textual context...
       */
      if ( pCtrl->fTextual )
      {
         while ( pSC != pEC )
            PutContextChar( *pSC++, pCtrl->fpMsg );
      }
   }

   /* the hex dump already ended its last row with a newline */
   if ( !pCtrl->fHex )
      fputc( '\n', pCtrl->fpMsg );
}
/* BMGSearch()
 *
 * Input   : pCtrl - pointer to structure containing output and ctrl info
 * Returns : number of matches found in file
 *
 * The pattern to search for must already have been set up using BMGSetup
 *
 * Works by applying BMG algorithm to a buffer. To ensure the pattern is not
 * inadvertently chopped up, patlen -1 bytes is always moved to the
 * start of the buffer. And the start of the buffer moved backwards
 * accordingly. If the last match was within the patlen - 1 bytes at
 * the end, only the remaining bytes are transferred.
 *
 * Reading stops at end of file and also when the input stream reports a
 * read error; in the latter case the matches counted so far are returned.
 */
long BMGSearch( OUTPUT_CTRL *pCtrl )
{
   register unsigned char *k;        /* scan pointer, on the last pattern position */
   register unsigned char *s;        /* walks backwards while verifying a candidate */
   register unsigned char *strend;   /* one past the last valid byte */
   register int j;
   int nTrans = 0;           /* number of bytes to transfer to the start of the buffer */
   int BufOfs;               /* buffer offset for each match */
   int BufLen;               /* actual length of buffer + nTrans */
   long nMatches = 0;        /* number of matches found */
   unsigned char *pStart;    /* Start of buffer to search */
   unsigned char *pEnd;      /* End of buffer to search */
   unsigned char *pFileBuf;  /* Start of file contents */
   unsigned long nBytes;     /* number of bytes read */
   unsigned long FileOfs = 0; /* current file offset */

   pStart = &bmg_buffer[ bmg_patlen - 1 ];   /* past the pattern at the start */
   pFileBuf = pStart;                        /* where to store the file block */

   for (;;)
   {
      nBytes = (unsigned long) fread( pFileBuf, 1, ( size_t ) BUFSIZ, pCtrl->fpIn );
      pEnd = pFileBuf + nBytes;
      BufOfs = -1;                           /* stays -1 if no match is found */
      BufLen = nBytes + nTrans;
      strend = pStart + BufLen;
      *(strend+1) = ~(*strend);   /* sentinel to prevent jump out of buffer */
      k = pStart + bmg_patlen - 1;

      for (;;)
      {
         /* skip ahead; a byte equal to the last pattern character adds LARGE */
         while ((k += bmg_delta0[ *(unsigned char *) k ]) < strend) ;
         if (k < ( pStart + LARGE))
            break;                           /* ran off the end without a candidate */
         k -= LARGE;

         /* The skip loop looks at *k before comparing k with strend, so the
          * candidate may sit on a stale byte at or past the end of the data.
          * Such a candidate is not part of the file.
          */
         if (k >= strend)
            break;

         /* verify the rest of the pattern right to left; j is tested first
          * so bmg_pattern[] is never indexed below zero
          */
         j = bmg_patlen - 1;
         s = k - 1;
         while (--j >= 0 && bmg_cmap[*s--] == bmg_pattern[j]) ;

         if (j >= 0)
            k++;                             /* mismatch, resume one byte further */
         else
         {  /* found submatch, k is on the last letter in the match */
            BufOfs = k - pStart + 1 - bmg_patlen;
            nMatches++;
            if ( pCtrl->fVerbose )
               Verbose( pCtrl , FileOfs, BufOfs, pStart, pEnd );
            k++;
            if (k >= strend)
               break;
         }
      }

      if ( BufOfs == -1 )         /* no match transfer patlen - 1 bytes to buffer start */
         nTrans = bmg_patlen -1;  /* to be sure that we don't skip a pattern */
      else                        /* we have a match, but where ? */
      {
         /* calculate the remaining buffer space after last match */
         nTrans = pEnd - ( pStart + BufOfs + bmg_patlen );
         if ( nTrans >= bmg_patlen )   /* not between End & End - patlen -1 */
            nTrans = bmg_patlen - 1;   /* Transfer patlen - 1 bytes */
      }

      FileOfs = FileOfs + BufLen - nTrans;   /* calculate file offset */

      if ( nTrans == 0 )                     /* match at exact end of buffer */
         pStart = pFileBuf;
      else
      {
         pStart = pFileBuf - nTrans;         /* move start pointer accordingly */
         memcpy( pStart, pEnd - nTrans, nTrans );   /* move remaining bytes to the start */
      }

      /* a read error never sets the EOF flag, so test for it as well or the
       * loop would keep reading zero bytes forever
       */
      if ( feof( pCtrl->fpIn ) || ferror( pCtrl->fpIn ) )
         break;
   }
   return nMatches;
}
/* BMGSearchReplace()
 *
 * Input   : pCtrl      - pointer to structure containing output and ctrl info
 *           ReplaceBuf - pointer to buffer which contains replacement
 *           nReplace   - number of bytes in replace buffer
 *
 * Returns : number of matches & replaces performed
 *           -1 if error in fwrite, disk might be full, or removed
 *           -1 if the input stream reports a read error
 *
 * The pattern to search for must already have been set up using BMGSetup
 *
 * Works by applying BMG algorithm to a buffer. To ensure the pattern is not
 * inadvertently chopped up, patlen -1 bytes is always moved to the
 * start of the buffer. And the pointer to the start of the buffer moved
 * backwards accordingly. If the last match was within the patlen - 1 bytes
 * at the end, only the remaining bytes are transferred.
 *
 * A match that starts inside text already consumed by the previous
 * replacement ( overlapping matches ) is skipped and left untouched.
 */
long BMGSearchReplace( OUTPUT_CTRL *pCtrl, char * ReplaceBuf, unsigned short nReplace )
{
   register unsigned char *k;        /* scan pointer, on the last pattern position */
   register unsigned char *strend;   /* one past the last valid byte */
   register unsigned char *s;        /* walks backwards while verifying a candidate */
   register int j;
   register int n;           /* n is number of bytes to write */
   int nTrans = 0;           /* number of bytes to transfer to the start of the buffer */
   int BufOfs;               /* buffer offset for each match */
   int BufLen;               /* actual length of buffer + nTrans */
   long nMatches = 0;        /* number of matches found */
   unsigned char *pLast;     /* Where to write the replacement from */
   unsigned char *pStart;    /* Start of buffer to search */
   unsigned char *pEnd;      /* End of buffer to search */
   unsigned char *pFileBuf;  /* Start of file contents, constant throughout
                                the entire search */
   unsigned long nBytes;     /* number of bytes read */
   unsigned long FileOfs = 0; /* current file offset */

   pStart = &bmg_buffer[ bmg_patlen - 1 ];   /* past the pattern at the start */
   pFileBuf = pStart;                        /* where to store the file block */

   for (;;)
   {
      nBytes = (unsigned long) fread( pFileBuf, 1, ( size_t ) BUFSIZ, pCtrl->fpIn );

      /* a read error never sets the EOF flag; without this test the loop
       * would keep reading zero bytes forever
       */
      if ( ferror( pCtrl->fpIn ) )
         return -1;

      pEnd = pFileBuf + nBytes;
      BufOfs = -1;                           /* stays -1 if no match is found */
      pLast = pStart;                        /* nothing written from this buffer yet */
      BufLen = nBytes + nTrans;
      strend = pStart + BufLen;
      *(strend+1) = ~(*strend);   /* sentinel to prevent jump out of buffer */
      k = pStart + bmg_patlen - 1;

      for (;;)
      {
         /* skip ahead; a byte equal to the last pattern character adds LARGE */
         while ((k += bmg_delta0[ *(unsigned char *) k ]) < strend) ;
         if (k < ( pStart + LARGE))
            break;                           /* ran off the end without a candidate */
         k -= LARGE;

         /* The skip loop looks at *k before comparing k with strend, so the
          * candidate may sit on a stale byte at or past the end of the data.
          * Accepting it would replace text that is not in the file and push
          * pLast beyond pEnd, making the final write length negative.
          */
         if (k >= strend)
            break;

         /* verify the rest of the pattern right to left; j is tested first
          * so bmg_pattern[] is never indexed below zero
          */
         j = bmg_patlen - 1;
         s = k - 1;
         while (--j >= 0 && bmg_cmap[*s--] == bmg_pattern[j]) ;

         if (j >= 0)
            k++;                             /* mismatch, resume one byte further */
         else
         {  /* found submatch, k is on the last letter in the match */
            BufOfs = k - pStart + 1 - bmg_patlen;

            /* number of bytes to write from buffer */
            n = ( pStart + BufOfs ) - pLast;
            if ( n >= 0 )
            {
               nMatches++;
               if ( pCtrl->fVerbose )
                  Verbose( pCtrl , FileOfs, BufOfs, pStart, pEnd );

               /* unchanged text in front of the match, return -1 if error */
               if ( fwrite( pLast, ( size_t ) 1, ( size_t ) n, pCtrl->fpOut ) != ( size_t ) n )
                  return -1;

               /* write replacement array, return -1 if error */
               if ( fwrite( ReplaceBuf, ( size_t ) 1, nReplace, pCtrl->fpOut ) != nReplace )
                  return -1;

               k++;
               pLast = k;                    /* set last marker to write from */
            }
            else
               k++;   /* match overlaps text already replaced..see header */

            if (k >= strend)
               break;
         }
      }

      if ( BufOfs == -1 )         /* no match transfer patlen - 1 bytes to buffer start */
         nTrans = bmg_patlen -1;  /* to be sure that we don't ignore a pattern */
      else                        /* we have a match, but where ? */
      {
         /* calculate the remaining buffer space after last match */
         nTrans = pEnd - ( pStart + BufOfs + bmg_patlen );
         if ( nTrans >= bmg_patlen )   /* not between End & End - patlen -1 */
            nTrans = bmg_patlen - 1;   /* Transfer patlen - 1 bytes */
      }

      FileOfs = FileOfs + BufLen - nTrans;   /* calculate file offset */

      if ( !feof( pCtrl->fpIn ) )
      {
         /* write the remainder of the buffer except the bytes carried over
          * to the next pass, return -1 if error
          */
         n = ( pEnd - nTrans ) - pLast;
         if ( fwrite( pLast, ( size_t ) 1, ( size_t ) n, pCtrl->fpOut ) != ( size_t ) n )
            return -1;
      }
      else
      {
         /* end of file, dump the rest of the buffer, return -1 if error */
         n = pEnd - pLast;
         if ( fwrite( pLast, ( size_t ) 1, ( size_t ) n, pCtrl->fpOut ) != ( size_t ) n )
            return -1;
         break;   /* for */
      }

      if ( nTrans == 0 )                     /* match at exact end of buffer */
         pStart = pFileBuf;
      else
      {
         pStart = pFileBuf - nTrans;         /* move start pointer accordingly */
         memcpy( pStart, pEnd - nTrans, nTrans );   /* move remaining bytes to the start */
      }
   }
   return nMatches;
}
/* BMGSetup()
 *
 * Input   : pat     - pointer to pattern string ( may contain any byte value )
 *           PatLen  - actual length of the pattern
 *           fFolded - flag which determines case folding
 * Returns : nothing
 *
 * Set up & compute Boyer-Moore delta ( jump ) table
 *
 * Stores the pattern in bmg_pattern ( lower-cased when fFolded is set ),
 * fills bmg_delta0 with the skip distance for every byte value and
 * prepares bmg_cmap for the requested kind of comparison.
 *
 * NOTE(review): PatLen is not range checked here; the code indexes
 * bmg_pattern[ PatLen - 1 ], so callers presumably guarantee
 * 1 <= PatLen <= PAT_BUFSIZ - confirm against the caller.
 */
void BMGSetup( char *pat, int PatLen, char fFolded)
{
   register int j;
   unsigned char ch;          /* pattern byte, always handled as unsigned */

   bmg_patlen = PatLen;

   /* Start from the identity map on every call, so that a case-folded
    * setup does not leave its 'A'..'Z' folding behind for a later
    * case-sensitive setup.
    */
   for (j = 0; j < 256; j++)
      bmg_cmap[j] = ( unsigned char ) j;

   if (fFolded)
   {  /* fold case while saving pattern */
      for (j = 0; j < bmg_patlen; j++)
      {
         /* the <ctype.h> functions need an unsigned char value; a plain
          * char may be negative for bytes above 0x7f
          */
         ch = ( unsigned char ) pat[j];
         bmg_pattern[j] = ( unsigned char ) ( isupper( ch ) ? tolower( ch ) : ch );
      }
   }
   else
      memcpy( bmg_pattern, pat, ( size_t ) bmg_patlen );

   /* bytes that do not occur in the pattern skip a whole pattern length */
   for (j = 0; j < 256; j++)
      bmg_delta0[j] = bmg_patlen;

   /* bytes inside the pattern skip the distance to the pattern end; a
    * later occurrence overwrites an earlier one
    */
   for (j = 0; j < bmg_patlen - 1; j++)
      bmg_delta0[bmg_pattern[j]] = bmg_patlen - j - 1;

   /* the last pattern byte makes the search loop overshoot by LARGE */
   bmg_delta0[bmg_pattern[bmg_patlen - 1]] = LARGE;

   if (fFolded)
   {
      /* give the upper case twin of every lower case pattern byte the same
       * skip distance
       */
      for (j = 0; j < bmg_patlen - 1; j++)
         if (islower( bmg_pattern[j] ))
            bmg_delta0[toupper( bmg_pattern[j] )] = bmg_patlen - j - 1;

      if (islower( bmg_pattern[bmg_patlen - 1] ))
         bmg_delta0[toupper( bmg_pattern[bmg_patlen - 1] )] = LARGE;

      /* buffer bytes 'A'..'Z' are compared as their lower case form */
      for (j = 'A'; j <= 'Z'; j++)
         bmg_cmap[j] = ( unsigned char ) tolower( j );
   }
}